---
title: "jaccard"
output: html_notebook
---




```{r eval=FALSE, include=FALSE}
# One-off setup; this chunk is not evaluated when the notebook is knit.
# Pins quanteda to 1.2.0 (NOTE(review): the analysis below appears to be
# written against this version's API -- confirm before upgrading).
# library() is used instead of require() so that a missing devtools fails
# immediately with a clear error rather than returning FALSE.
library(devtools)
install_version("quanteda", version = "1.2.0", repos = "http://cran.us.r-project.org")
```


```{r, message=FALSE}
library(readtext)
library(quanteda)
library(tidyverse)
library(stm)
library(tidytext)
library(haven)
library(data.table)
library(ggridges)
library(viridis)
```


UNGD data are available on the Harvard Dataverse at https://doi.org/10.7910/DVN/0TJX8Y


```{r}

# Root folder of the local UNGD copy. The trailing "/" is required because
# the paths below are assembled with paste0().
DATA_DIR <- "~/Dropbox/Research/UNGDC projects/UN Data/"

# Every file under TXT/ is read as one document. File names are split on "_"
# into the document variables Country, Session and Year.
speech_glob <- paste0(DATA_DIR, "TXT/*")
ungd_files <- readtext(
  speech_glob,
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Country", "Session", "Year")
)

# Corpus built from the "text" column of the readtext result.
ungd_corpus <- corpus(ungd_files, text_field = "text")

```



# Analysis


```{r EU president, include=FALSE}
# Presidency lookup table; the similarity loop reads its `Year` and
# `Country_alt` columns to find the presiding country for each year.
presidency <- readxl::read_excel(path = "../presidency.xlsx")

```


```{r similarity, include=FALSE}
# EU president
# For every year, compare each country's speech with the speech of the country
# named in `presidency` for that year. Per year: subset the corpus, tokenise,
# build a stemmed dfm, filter and trim features, apply tf-idf weights, then
# compute extended-Jaccard similarity against the presidency document.
# Result: `pres_similarity`, one row per document (row names are the document
# names) with columns `PRES` (similarity) and `Year`.
years <- 1970:2017

# One data frame per year is collected here and bound once after the loop,
# instead of growing `pres_similarity` with rbind() on every iteration.
yearly_similarity <- vector("list", length(years))

for (k in seq_along(years)) {
  i <- years[k]

  # Creating corpus for each year
  ungdc.i <- corpus_subset(ungd_corpus, Year == i)

  tok <- tokens(ungdc.i, what = "word",
                remove_punct = TRUE,
                remove_symbols = TRUE,
                remove_numbers = TRUE,
                remove_twitter = TRUE,
                remove_url = TRUE,
                remove_hyphens = TRUE,
                verbose = TRUE)

  # Lower-cased, stop-word-free, stemmed document-feature matrix.
  # Named `dfm.i` so the variable no longer shadows the dfm() function.
  dfm.i <- dfm(tok,
               tolower = TRUE,
               remove = stopwords("english"),
               stem = TRUE,
               verbose = TRUE)

  # Removing features that contain a digit or hyphen, features that contain
  # punctuation, and features shorter than four characters.
  dfm.m <- dfm_select(dfm.i, c("[\\d-]", "[[:punct:]]", "^.{1,3}$"),
                      selection = "remove",
                      valuetype = "regex", verbose = TRUE)

  # Keeping only features that occur at least 5 times overall and in at
  # least 3 documents.
  dfm.trim <- dfm_trim(dfm.m, min_termfreq = 5, min_docfreq = 3)

  # tf-idf weighting of the filtered and trimmed matrix. The weights were
  # previously computed from the unfiltered dfm, which silently discarded
  # the two cleaning steps above.
  dfm.w <- dfm_tfidf(dfm.trim)

  # Presidency country for this year. which() drops rows whose Year is NA.
  pres <- as.character(presidency$Country_alt[which(presidency$Year == i)])

  # Document(s) of the presidency country. %in% never returns NA and does not
  # recycle if `presidency` lists more than one country for a year.
  pres_docs <- docnames(dfm.w)[docvars(dfm.w, "Country") %in% pres]

  if (length(pres_docs) == 0) {
    warning("No speech found for the EU presidency in ", i,
            "; year skipped.", call. = FALSE)
    next
  }

  # Extended Jaccard similarity between the presidency document and the
  # documents of that year. Only the first selected document is kept.
  simil.i <- textstat_simil(dfm.w, pres_docs,
                            margin = "documents", method = "ejaccard")
  similarities.i <- as.data.frame(as.list(simil.i, sorted = FALSE)[1])

  names(similarities.i)[1] <- "PRES"
  similarities.i$Year <- i

  yearly_similarity[[k]] <- similarities.i
}

# Drop skipped years, then bind everything at once. The list is unnamed so
# rbind() keeps each data frame's row names (the document names), which the
# next chunk parses into country codes.
yearly_similarity <- yearly_similarity[!vapply(yearly_similarity, is.null, logical(1))]
pres_similarity <- do.call(rbind, c(list(data.frame()), yearly_similarity))
```

```{r similarity adding country names }
# Row names are the document names (expected form "<Country>_<Session>_<Year>.txt").
# Strip the extension, then the year, then the session, leaving the country code.
# The patterns are escaped and anchored to the end of the string so that "."
# only matches a literal dot and nothing inside the country code can be removed.
pres_similarity$Country <- rownames(pres_similarity)

pres_similarity$Country <- str_replace(pres_similarity$Country, "\\.txt$", "") %>%
  str_replace(., "_\\d{4}$", "") %>%
  str_replace(., "_\\d{2}$", "")
```



```{r}
#Logicals for EU member states
EU <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD")
wave1 <- c("DNK", "IRL", "GBR")
wave2 <- "GRC" 
wave3 <- c("ESP", "PRT") 
wave4 <- c("AUT", "FIN", "SWE") 
wave5 <- c("CZE", "HUN", "POL", "EST", "LVA", "LTU", "CYP", "MLT", "SVK", "SVN")
wave6 <- c("BGR", "ROU") 
wave7 <- "HRV" 
```





```{r}
# Working copy of the similarity table with two flags:
#   is.eu - TRUE for the six `EU` countries in every year, and for later
#           members in every year after the cut-off listed below;
#   waves - copy of the initial six-country flag (not updated afterwards).
simil <- mutate(pres_similarity, is.eu = Country %in% EU, waves = is.eu)

# For each later member, the last year in which it is NOT flagged as a member;
# rows with a strictly greater Year are switched to TRUE.
not_member_through <- c(
  # first wave
  DNK = 1972, IRL = 1972, GBR = 1972,
  # second wave
  GRC = 1980,
  # third wave
  ESP = 1985, PRT = 1985,
  # fourth wave
  AUT = 1994, FIN = 1994, SWE = 1994,
  # fifth wave
  CZE = 2003, HUN = 2003, POL = 2003, EST = 2003, LVA = 2003,
  LTU = 2003, CYP = 2003, MLT = 2003, SVK = 2003, SVN = 2003,
  # sixth wave
  BGR = 2006, ROU = 2006,
  # seventh wave
  HRV = 2012
)

# Countries without an entry look up to NA; which() skips those rows (and
# rows with a missing Country or Year), so their flag stays unchanged.
cutoff <- not_member_through[as.character(simil$Country)]
simil$is.eu[which(simil$Year > cutoff)] <- TRUE
```


```{r}
# Time-invariant group flags: `eu6` marks the six `EU` countries and `wave5`
# marks the fifth-wave countries, regardless of year.
# Base assignment is used instead of mutate(): inside mutate() the name
# `wave5` on the right-hand side resolves to the data-frame column once it
# exists, so re-running the chunk would turn every `wave5` value into FALSE.
simil$eu6 <- simil$Country %in% EU
simil$wave5 <- simil$Country %in% wave5

# Flags for the other waves are currently disabled:
# simil$wave1 <- simil$Country %in% wave1
# simil$wave2 <- simil$Country %in% wave2
# simil$wave3 <- simil$Country %in% wave3
# simil$wave4 <- simil$Country %in% wave4
# simil$wave6 <- simil$Country %in% wave6
# simil$wave7 <- simil$Country %in% wave7
```

```{r}
# Cumulative membership flags. Each column starts as a copy of the previous
# one and then switches on the newly joined countries for the years after
# their cut-off. which() leaves rows with a missing Country or Year unchanged.

# eu9: the six `eu6` countries plus DNK, IRL and GBR after 1972.
simil <- simil %>% mutate(eu9 = eu6)
joined <- simil$Country %in% c("DNK", "IRL", "GBR") & simil$Year > 1972
simil$eu9[which(joined)] <- TRUE

# eu12: eu9 plus GRC after 1980 and ESP, PRT after 1985.
simil <- simil %>% mutate(eu12 = eu9)
joined <- (simil$Country %in% "GRC" & simil$Year > 1980) |
  (simil$Country %in% c("ESP", "PRT") & simil$Year > 1985)
simil$eu12[which(joined)] <- TRUE

# eu15: eu12 plus AUT, FIN and SWE after 1994.
simil <- simil %>% mutate(eu15 = eu12)
joined <- simil$Country %in% c("AUT", "FIN", "SWE") & simil$Year > 1994
simil$eu15[which(joined)] <- TRUE


```


```{r}
# Save the similarity table with all membership flags. The path is relative,
# so the file lands in the working directory of the notebook session.
output_file <- "jaccard_similarity.csv"
readr::write_csv(simil, output_file)
```


